{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ProtBert-TF.ipynb",
      "provenance": [],
      "authorship_tag": "ABX9TyMLisnqYnsKug8W2AkTPCr3",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "79e0d3d58fa2429ebc16b0acc1033ff6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_97780d7d73c448e38539ae377add5f41",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_6db1e1a1771941629e9415b0a58062b2",
              "IPY_MODEL_d8e2a43d8ee7431bb71e7df91cb2ad96"
            ]
          }
        },
        "97780d7d73c448e38539ae377add5f41": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6db1e1a1771941629e9415b0a58062b2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_fa3841c7d34649d49eb302bb5617fc30",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 81,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 81,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_3bd68f67d5094c958b1f751e0cdb606a"
          }
        },
        "d8e2a43d8ee7431bb71e7df91cb2ad96": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_ef259f39f6de43378ab5c5852802ab9d",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 81.0/81.0 [02:07&lt;00:00, 1.58s/B]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_d9f6f1933058405b9f23c8122ed4fb56"
          }
        },
        "fa3841c7d34649d49eb302bb5617fc30": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "3bd68f67d5094c958b1f751e0cdb606a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "ef259f39f6de43378ab5c5852802ab9d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "d9f6f1933058405b9f23c8122ed4fb56": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "31c0799296544e468ec11ef1b0d816c9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_8a930b8625b646d594a507520d1b4f8d",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_08de47592ebe4968bb17944a090c8ade",
              "IPY_MODEL_3ad550e4c8f54a78b9d7b3df41f835e0"
            ]
          }
        },
        "8a930b8625b646d594a507520d1b4f8d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "08de47592ebe4968bb17944a090c8ade": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_88005a24c63f49f5b2dc13e1e7211084",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 313,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 313,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_76fb14cfc1e24edf9c5ac6bb92e2b99c"
          }
        },
        "3ad550e4c8f54a78b9d7b3df41f835e0": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_af1acacd875140bba4d26e8429757c84",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 313/313 [00:02&lt;00:00, 133B/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_659df700831b424ca944c774853346b0"
          }
        },
        "88005a24c63f49f5b2dc13e1e7211084": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "76fb14cfc1e24edf9c5ac6bb92e2b99c": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "af1acacd875140bba4d26e8429757c84": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "659df700831b424ca944c774853346b0": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6fa3d541fcb749aebb160dac63ed9d6f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_2a7346585d874cddba907dbc4f71ea55",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_8e031382ee764aa7bca0512ecf79b47f",
              "IPY_MODEL_028b163381ae47beaf4f4d072db062d7"
            ]
          }
        },
        "2a7346585d874cddba907dbc4f71ea55": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "8e031382ee764aa7bca0512ecf79b47f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_10c837776662474a815b84d68690d163",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 1684058277,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 1684058277,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_2d72e1c83f344494aa043d018687e45c"
          }
        },
        "028b163381ae47beaf4f4d072db062d7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_9c626019a3b64bc5adcd7b0c3040144d",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 1.68G/1.68G [02:00&lt;00:00, 14.0MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_9bd59746015f43d588361f897b882eaa"
          }
        },
        "10c837776662474a815b84d68690d163": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "2d72e1c83f344494aa043d018687e45c": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "9c626019a3b64bc5adcd7b0c3040144d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "9bd59746015f43d588361f897b882eaa": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/agemagician/ProtTrans/blob/master/Embedding/TensorFlow/Advanced/ProtBert.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3PubGNU0dZmr"
      },
      "source": [
        "<h3> Extracting protein sequences' features using ProtBert pretrained-model <h3>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SAV0QHxfdZmt"
      },
      "source": [
        "<b>1. Load necessry libraries including huggingface transformers<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "C34oYIVxdeab",
        "outputId": "79f696ff-e595-489a-fa98-d87852e4805f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        }
      },
      "source": [
        "!pip install -q transformers"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\u001b[K     |████████████████████████████████| 1.1MB 2.8MB/s \n",
            "\u001b[K     |████████████████████████████████| 1.1MB 17.5MB/s \n",
            "\u001b[K     |████████████████████████████████| 3.0MB 20.9MB/s \n",
            "\u001b[K     |████████████████████████████████| 890kB 46.1MB/s \n",
            "\u001b[?25h  Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3NJPeESYdZmw"
      },
      "source": [
        "import tensorflow as tf\n",
        "from transformers import TFBertModel, BertTokenizer,BertConfig\n",
        "import re\n",
        "import numpy as np"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JftgWQNZeGv3"
      },
      "source": [
        "<b>2. Load the vocabulary and ProtBert Model</b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "70TQTgjadZm-",
        "outputId": "e0474226-5a9c-4853-fdaf-d1d19e23df9f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66,
          "referenced_widgets": [
            "79e0d3d58fa2429ebc16b0acc1033ff6",
            "97780d7d73c448e38539ae377add5f41",
            "6db1e1a1771941629e9415b0a58062b2",
            "d8e2a43d8ee7431bb71e7df91cb2ad96",
            "fa3841c7d34649d49eb302bb5617fc30",
            "3bd68f67d5094c958b1f751e0cdb606a",
            "ef259f39f6de43378ab5c5852802ab9d",
            "d9f6f1933058405b9f23c8122ed4fb56"
          ]
        }
      },
      "source": [
        "tokenizer = BertTokenizer.from_pretrained(\"Rostlab/prot_bert\", do_lower_case=False )"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "79e0d3d58fa2429ebc16b0acc1033ff6",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=81.0, style=ProgressStyle(description_w…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2GcmM3pGdZnE",
        "outputId": "e84f7f65-ea6c-46bf-f18f-eccd5df6a37a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 220,
          "referenced_widgets": [
            "31c0799296544e468ec11ef1b0d816c9",
            "8a930b8625b646d594a507520d1b4f8d",
            "08de47592ebe4968bb17944a090c8ade",
            "3ad550e4c8f54a78b9d7b3df41f835e0",
            "88005a24c63f49f5b2dc13e1e7211084",
            "76fb14cfc1e24edf9c5ac6bb92e2b99c",
            "af1acacd875140bba4d26e8429757c84",
            "659df700831b424ca944c774853346b0",
            "6fa3d541fcb749aebb160dac63ed9d6f",
            "2a7346585d874cddba907dbc4f71ea55",
            "8e031382ee764aa7bca0512ecf79b47f",
            "028b163381ae47beaf4f4d072db062d7",
            "10c837776662474a815b84d68690d163",
            "2d72e1c83f344494aa043d018687e45c",
            "9c626019a3b64bc5adcd7b0c3040144d",
            "9bd59746015f43d588361f897b882eaa"
          ]
        }
      },
      "source": [
        "model = TFBertModel.from_pretrained(\"Rostlab/prot_bert\", from_pt=True)"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "31c0799296544e468ec11ef1b0d816c9",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=313.0, style=ProgressStyle(description_…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "6fa3d541fcb749aebb160dac63ed9d6f",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1684058277.0, style=ProgressStyle(descr…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']\n",
            "- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPretraining model).\n",
            "- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
            "All the weights of TFBertModel were initialized from the PyTorch model.\n",
            "If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZkqAotTcdZnW"
      },
      "source": [
        "<b>3. Create or load sequences and map rarely occured amino acids (U,Z,O,B) to (X)<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "a0zwKinIdZnX"
      },
      "source": [
        "sequences_Example = [\"A E T C Z A O\",\"S K T Z P\"]"
      ],
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EkINwL9DdZna"
      },
      "source": [
        "sequences_Example = [re.sub(r\"[UZOB]\", \"X\", sequence) for sequence in sequences_Example]"
      ],
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "66BZEB3MdZnf"
      },
      "source": [
        "<b>4. Tokenize, encode sequences and load it into the GPU if possibile<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xt5uYuu7dZnf"
      },
      "source": [
        "ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, padding=True, return_tensors=\"tf\")"
      ],
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Grl3ieUhdZnj"
      },
      "source": [
        "input_ids = ids['input_ids']\n",
        "attention_mask = ids['attention_mask']"
      ],
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Zylf1HyBdZnl"
      },
      "source": [
        "<b>5. Extracting sequences' features and load it into the CPU if needed<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i8CVGPRFdZnm"
      },
      "source": [
        "embedding = model(input_ids)[0]"
      ],
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "P8IsEs35T0T4"
      },
      "source": [
        "embedding = np.asarray(embedding)"
      ],
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7Q2AGxlCUrot"
      },
      "source": [
        "attention_mask = np.asarray(attention_mask)"
      ],
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "R6oeRZ7xdZns"
      },
      "source": [
        "<b>6. Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that is added by ProtBert model<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1XXoVSPDdZns"
      },
      "source": [
        "features = [] \n",
        "for seq_num in range(len(embedding)):\n",
        "    seq_len = (attention_mask[seq_num] == 1).sum()\n",
        "    seq_emd = embedding[seq_num][1:seq_len-1]\n",
        "    features.append(seq_emd)"
      ],
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "swOQ_7r9dZnw",
        "outputId": "504e3ce2-56ed-4080-8962-8aef8ec9346a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 391
        }
      },
      "source": [
        "print(features)"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[array([[ 0.09233958,  0.13914403, -0.05241528, ..., -0.13947995,\n",
            "        -0.04280058,  0.07431365],\n",
            "       [ 0.1150599 ,  0.01998255, -0.08627477, ..., -0.0094623 ,\n",
            "        -0.18727243,  0.13169028],\n",
            "       [ 0.04012017,  0.05467904, -0.04245842, ...,  0.02054806,\n",
            "        -0.03819012,  0.10118145],\n",
            "       ...,\n",
            "       [ 0.08853209,  0.0228723 , -0.05121124, ..., -0.06733349,\n",
            "        -0.03013721,  0.04913726],\n",
            "       [ 0.10792378,  0.0977104 , -0.05832328, ..., -0.12765586,\n",
            "        -0.06493296,  0.12894128],\n",
            "       [ 0.05457975,  0.03635868, -0.0782121 , ..., -0.03016013,\n",
            "        -0.06015305,  0.08903901]], dtype=float32), array([[ 0.08192439,  0.04021844,  0.00467392, ...,  0.08482227,\n",
            "        -0.05522079,  0.02153459],\n",
            "       [ 0.07777947,  0.0743171 ,  0.03168098, ...,  0.09620474,\n",
            "        -0.02188272, -0.0145603 ],\n",
            "       [ 0.13320833,  0.11049384, -0.05078609, ...,  0.09844306,\n",
            "        -0.02032756,  0.05769319],\n",
            "       [ 0.02869374,  0.07422507, -0.01506015, ..., -0.05882555,\n",
            "        -0.00698154, -0.01523799],\n",
            "       [ 0.19057377,  0.03302053, -0.09203957, ...,  0.0265659 ,\n",
            "         0.07045491,  0.01709477]], dtype=float32)]\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}